import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# Load the raw Shakespeare corpus; one row per line of dialogue / stage direction.
data = pd.read_csv('../data/external/Shakespeare_data.csv')
data.head()
data.info()
# Assign the filled column back instead of fillna(inplace=True) on a column
# slice — that form is chained assignment, deprecated (and a no-op under
# copy-on-write) in modern pandas.
data["Player"] = data["Player"].fillna("Other")
data.shape
# Drop rows still missing any field (e.g. ActSceneLine) and renumber rows so
# later row-wise processing lines up with a contiguous integer index.
data = data.dropna()
data.reset_index(drop=True, inplace=True)
data.shape
# Quick look at how many datalines each (play, player) pair contributes.
data.groupby(['Play', 'Player']).count()['Dataline']
# Distinct speakers per play, largest casts first.
numberofPlayers = (
    data.groupby('Play')['Player']
    .nunique()
    .sort_values(ascending=False)
    .rename('Number of Players')
    .reset_index()
)
numberofPlayers = numberofPlayers[['Number of Players', 'Play']]
numberofPlayers
# Horizontal bar chart: one bar per play.
plt.figure(figsize=(8, 8))
ax = sns.barplot(x='Number of Players', y='Play', data=numberofPlayers)
ax.set(xlabel='Number of Players', ylabel='Shakespeare Play')
plt.show()
# Distinct lines per speaker; the plot below shows the 45 most prolific players.
numberoflines = (
    data.groupby('Player')['PlayerLine']
    .nunique()
    .sort_values(ascending=False)
    .rename('Number of lines')
    .reset_index()
)
numberoflines = numberoflines[['Number of lines', 'Player']]
numberoflines.sample(10)
plt.figure(figsize=(10, 15))
ax = sns.barplot(x='Number of lines', y='Player', data=numberoflines.head(45))
ax.set(xlabel='No of lines', ylabel='Player')
plt.show()
# Peek at the raw act.scene.line markers alongside the play title.
df1 = pd.DataFrame(data, columns=['ActSceneLine', 'Play'])
df1.sample(5)
# 'ActSceneLine' is dotted "act.scene.line"; split it into three columns
# and drop the combined original.
parts = data['ActSceneLine'].str.split('.', expand=True)
data['Act'] = parts[0]
data['Scene'] = parts[1]
data['SceneLine'] = parts[2]
data = data.drop(columns=['ActSceneLine'])
data.head()
# Highest act each player appears in. NOTE(review): 'Act' is still a string
# here, so this max is lexicographic — fine if acts are single digits, TODO
# confirm for this corpus.
numberofActs = (
    data.groupby('Player')['Act']
    .max()
    .sort_values(ascending=False)
    .rename('Number of Acts')
    .reset_index()
)
numberofActs = numberofActs[['Number of Acts', 'Player']]
numberofActs.sample(10)
data.dtypes
# Cast to int so the bar plot treats act counts numerically.
numberofActs['Number of Acts'] = numberofActs['Number of Acts'].astype(int)
plt.figure(figsize=(10, 15))
ax = sns.barplot(x='Number of Acts', y='Player', data=numberofActs.sample(100))
ax.set(xlabel='Number of Acts', ylabel='Player')
plt.show()
# Normalize dialogue: lowercase, then strip punctuation.
data['PlayerLine'] = data['PlayerLine'].str.lower()
# The original looped over marks with bare str.replace; on pandas < 2.0 the
# default regex=True makes '.' match every character (wiping the text) and
# '?' an invalid pattern. One explicit character-class regex removes all six
# marks in a single, version-safe pass.
data['PlayerLine'] = data['PlayerLine'].str.replace(r'[?:!.,;]', '', regex=True)
# Fetch the NLTK resources used below (tokenizer models, WordNet, stopwords).
nltk.download('punkt')
print("------------------------------------------------------------")
nltk.download('wordnet')
print("------------------------------------------------------------")
nltk.download('stopwords')
wordnet_lemmatizer = WordNetLemmatizer()


def _lemmatize_line(line):
    """Lemmatize each space-separated token of *line* as a verb (pos='v')."""
    return " ".join(
        wordnet_lemmatizer.lemmatize(word, pos="v") for word in line.split(" ")
    )


# Same output as the original positional loop over data.loc[row], but via
# .apply — no reliance on a contiguous integer index and no throwaway lists.
data['PlayerLine'] = data['PlayerLine'].apply(_lemmatize_line)
data.head()
# Remove English stopwords from each line. Two fixes over the original:
#  * str.replace gets an explicit regex=True — the default flipped to False
#    in pandas 2.0, which would make these word-boundary patterns literal
#    text and replace nothing.
#  * the tokenize step filtered with `stop_word`, the loop variable left
#    holding the LAST stopword (a substring check against one word); it now
#    tests membership in the full stopword set.
stop_words = list(stopwords.words('english'))
stop_word_set = set(stop_words)
for stop_word in stop_words:
    regex_stopword = r"\b" + stop_word + r"\b"
    data['PlayerLine'] = data['PlayerLine'].str.replace(regex_stopword, '', regex=True)
data['Word_tokens'] = data['PlayerLine'].apply(
    lambda x: ' '.join(
        w for w in nltk.word_tokenize(x.lower().strip()) if w not in stop_word_set
    )
)
data.Word_tokens.head()
# One blob of comma-joined tokens per player, brace-wrapped for the word clouds.
data_play_content = pd.DataFrame(
    data.groupby('Player')['Word_tokens'].apply(lambda x: "{%s}" % ', '.join(x))
)
data_play_content.head(10)
from wordcloud import WordCloud
def WordCloudGeneration(str1, title):
    """Render a word cloud of the text *str1* on a black background, titled *title*."""
    plt.figure(figsize=(20, 10))
    cloud = WordCloud(
        background_color='black', width=600, height=400, max_font_size=50
    ).generate(str1)
    # Fixed recolor seed keeps the palette reproducible across calls.
    cloud.recolor(random_state=0)
    plt.title(title, fontsize=60, color='red')
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()
# Draw a word cloud for 20 randomly sampled players.
for player in data_play_content.sample(20).index:
    WordCloudGeneration(data_play_content.loc[player, 'Word_tokens'], player)
# Considering only the top 100 features across the corpus. (This note was a
# bare prose line in the original — a SyntaxError in a .py script.)
ngram_range = (1, 2)
max_features = 100
min_df = 10
max_df = 20
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        lowercase=True,
                        norm='l2',
                        max_features=max_features,
                        max_df=max_df,   # int: keep terms appearing in at most 20 docs
                        min_df=min_df,   # int: require at least 10 docs
                        sublinear_tf=True)
feature_vectors = tfidf.fit_transform(data['PlayerLine']).toarray()
# Dense feature frame: one column f0..f99 per TF-IDF feature.
f = pd.DataFrame(
    feature_vectors,
    index=range(feature_vectors.shape[0]),
    columns=['f' + str(i) for i in range(feature_vectors.shape[1])],
)
f['Act'] = data['Act']
f['Scene'] = data['Scene']
f['SceneLine'] = data['SceneLine']
f['Play'] = data['Play']
# One-hot encode the play title so the classifiers get numeric inputs.
f = pd.get_dummies(f, columns=['Play'])
# Target: the speaker of each line.
dftarget = pd.DataFrame(data, columns=['Player'])
f.sample(10)
# Hold out 20% for evaluation. (train_test_split is already imported at the
# top of the file; the original re-imported it here.)
# NOTE(review): no random_state, so the split — and all accuracies — vary per run.
xtrain, xtest, ytrain, ytest = train_test_split(f, dftarget, test_size=0.2)
print(xtrain.shape, xtest.shape)
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
model = GaussianNB()
dc = DecisionTreeClassifier()
rf = RandomForestClassifier()
# Ravel the single-column target frame to 1-D; fitting on a DataFrame
# triggers sklearn's DataConversionWarning (same fitted model either way).
ytrain_1d = np.ravel(ytrain)
model.fit(xtrain, ytrain_1d)
dc.fit(xtrain, ytrain_1d)
rf.fit(xtrain, ytrain_1d)
from sklearn.metrics import accuracy_score
print('Accuracy for GaussianNB: ', 100 * accuracy_score(ytest, model.predict(xtest)), '%')
print('Accuracy for Decision Tree: ', 100 * accuracy_score(ytest, dc.predict(xtest)), '%')
print('Accuracy for random forest: ', 100 * accuracy_score(ytest, rf.predict(xtest)), '%')
# k-nearest-neighbours baseline with default hyperparameters.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(xtrain, ytrain)
print('Accuracy for nearest neighbour: ', 100 * accuracy_score(ytest, knn.predict(xtest)), '%')
# Probe decision-tree depth: fit one tree per max_depth, then report accuracy
# (same fit/predict ordering as the original dc1..dc5 sequence).
depths = (10, 25, 50, 100, 150)
trees = {}
for depth in depths:
    trees[depth] = DecisionTreeClassifier(max_depth=depth)
    trees[depth].fit(xtrain, ytrain)
print('Max Depth of 10 :', 100 * accuracy_score(ytest, trees[10].predict(xtest)), '%')
for depth in depths[1:]:
    print(accuracy_score(ytest, trees[depth].predict(xtest)))